Introduction


Business Understanding

Problem statement

The objective is to develop predictive models for an insurance company using the Heart Disease dataset from the UCI Machine Learning Repository. The goal is to predict the likelihood of a person developing heart disease, which would help the insurance company estimate health risks and adjust premiums accordingly.


Data Understanding

The dataset contains various features related to patients’ health and demographic information. We will explore the dataset to understand its structure and relationships between variables.

Data dictionary

The dataset contains 14 key attributes that are either numerical or categorical.

Attribute Type Description Constraints/ Rules
age Numerical The age of the patient in years Range: 29-77 (based on dataset statistics)
sex Categorical The gender of the patient Values: 1 = Male, 0 = Female
cp Categorical Type of chest pain experienced by the patient Values: 1 = Typical angina, 2 = Atypical angina, 3 = Non-anginal pain, 4 = Asymptomatic
trestbps Numerical Resting blood pressure of the patient, measured in mmHg Range: Typically, between 94 and 200 mmHg
chol Numerical Serum cholesterol level in mg/dl Range: Typically, between 126 and 564 mg/dl
fbs Categorical Fasting blood sugar level > 120 mg/dl Values: 1 = True, 0 = False
restecg Categorical Results of the patient’s resting electrocardiogram Values: 0 = Normal, 1 = ST-T wave abnormality, 2 = Probable or definite left ventricular hypertrophy
thalach Numerical Maximum heart rate achieved during a stress test Range: Typically, between 71 and 202 bpm
exang Categorical Whether the patient experiences exercise-induced angina Values: 1 = Yes, 0 = No
oldpeak Numerical ST depression induced by exercise relative to rest (an ECG measure) Range: 0.0 to 6.2 (higher values indicate more severe abnormalities)
slope Categorical Slope of the peak exercise ST segment Values: 1 = Upsloping, 2 = Flat, 3 = Downsloping
ca Numerical Number of major vessels colored by fluoroscopy Range: 0-3
thal Categorical Blood disorder variable related to thalassemia Values: 3 = Normal, 6 = Fixed defect, 7 = Reversible defect
target Categorical Diagnosis of heart disease Values: 0 = No heart disease, 1 = Presence of heart disease (the raw file codes presence as 1-4; these values are recoded to 1 during preprocessing)


Data Preparation

Data loading

Load the dataset from the UCI website into memory.

# Location of the processed Cleveland heart-disease file on the UCI repository
url <- "https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data"

# Read the file straight from the URL into a data frame using base R only
# (no separate download helper is needed). The file has no header row, and
# "?" entries are converted to NA on read.
Heart.df <- read.csv(url, header = FALSE, na.strings = "?")

Rename the columns with meaningful names

# Replace the default column names with descriptive attribute names,
# listed in the same order as the columns appear in the file.
names(Heart.df) <- c(
  "age", "sex", "cp", "trestbps", "chol", "fbs", "restecg",
  "thalach", "exang", "oldpeak", "slope", "ca", "thal", "target"
)

Display dimensions of the dataset

# Number of rows followed by number of columns
c(nrow(Heart.df), ncol(Heart.df))
## [1] 303  14

Display the first six rows of the dataset

# Preview the first six records
head(Heart.df, n = 6L)
##   age sex cp trestbps chol fbs restecg thalach exang oldpeak slope ca thal
## 1  63   1  1      145  233   1       2     150     0     2.3     3  0    6
## 2  67   1  4      160  286   0       2     108     1     1.5     2  3    3
## 3  67   1  4      120  229   0       2     129     1     2.6     2  2    7
## 4  37   1  3      130  250   0       0     187     0     3.5     3  0    3
## 5  41   0  2      130  204   0       2     172     0     1.4     1  0    3
## 6  56   1  2      120  236   0       0     178     0     0.8     1  0    3
##   target
## 1      0
## 2      2
## 3      1
## 4      0
## 5      0
## 6      0

Display the structure of the dataframe

# Column-wise overview: one line per column with its type and first values
glimpse(x = Heart.df)
## Rows: 303
## Columns: 14
## $ age      <dbl> 63, 67, 67, 37, 41, 56, 62, 57, 63, 53, 57, 56, 56, 44, 52, 5…
## $ sex      <dbl> 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1…
## $ cp       <dbl> 1, 4, 4, 3, 2, 2, 4, 4, 4, 4, 4, 2, 3, 2, 3, 3, 2, 4, 3, 2, 1…
## $ trestbps <dbl> 145, 160, 120, 130, 130, 120, 140, 120, 130, 140, 140, 140, 1…
## $ chol     <dbl> 233, 286, 229, 250, 204, 236, 268, 354, 254, 203, 192, 294, 2…
## $ fbs      <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0…
## $ restecg  <dbl> 2, 2, 2, 0, 2, 0, 2, 0, 2, 2, 0, 2, 2, 0, 0, 0, 0, 0, 0, 0, 2…
## $ thalach  <dbl> 150, 108, 129, 187, 172, 178, 160, 163, 147, 155, 148, 153, 1…
## $ exang    <dbl> 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1…
## $ oldpeak  <dbl> 2.3, 1.5, 2.6, 3.5, 1.4, 0.8, 3.6, 0.6, 1.4, 3.1, 0.4, 1.3, 0…
## $ slope    <dbl> 3, 2, 2, 3, 1, 1, 3, 1, 2, 3, 2, 2, 2, 1, 1, 1, 3, 1, 1, 1, 2…
## $ ca       <dbl> 0, 3, 2, 0, 0, 0, 2, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0…
## $ thal     <dbl> 6, 3, 7, 3, 3, 3, 3, 3, 7, 7, 6, 3, 6, 7, 7, 3, 7, 3, 3, 3, 3…
## $ target   <int> 0, 2, 1, 0, 0, 0, 3, 0, 2, 1, 0, 0, 2, 0, 0, 0, 1, 0, 0, 0, 0…

Display the statistical summary of the dataframe

# Per-column summary statistics, including the NA count where values are missing
summary(object = Heart.df)
##       age             sex               cp           trestbps    
##  Min.   :29.00   Min.   :0.0000   Min.   :1.000   Min.   : 94.0  
##  1st Qu.:48.00   1st Qu.:0.0000   1st Qu.:3.000   1st Qu.:120.0  
##  Median :56.00   Median :1.0000   Median :3.000   Median :130.0  
##  Mean   :54.44   Mean   :0.6799   Mean   :3.158   Mean   :131.7  
##  3rd Qu.:61.00   3rd Qu.:1.0000   3rd Qu.:4.000   3rd Qu.:140.0  
##  Max.   :77.00   Max.   :1.0000   Max.   :4.000   Max.   :200.0  
##                                                                  
##       chol            fbs            restecg          thalach     
##  Min.   :126.0   Min.   :0.0000   Min.   :0.0000   Min.   : 71.0  
##  1st Qu.:211.0   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:133.5  
##  Median :241.0   Median :0.0000   Median :1.0000   Median :153.0  
##  Mean   :246.7   Mean   :0.1485   Mean   :0.9901   Mean   :149.6  
##  3rd Qu.:275.0   3rd Qu.:0.0000   3rd Qu.:2.0000   3rd Qu.:166.0  
##  Max.   :564.0   Max.   :1.0000   Max.   :2.0000   Max.   :202.0  
##                                                                   
##      exang           oldpeak         slope             ca        
##  Min.   :0.0000   Min.   :0.00   Min.   :1.000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:0.00   1st Qu.:1.000   1st Qu.:0.0000  
##  Median :0.0000   Median :0.80   Median :2.000   Median :0.0000  
##  Mean   :0.3267   Mean   :1.04   Mean   :1.601   Mean   :0.6722  
##  3rd Qu.:1.0000   3rd Qu.:1.60   3rd Qu.:2.000   3rd Qu.:1.0000  
##  Max.   :1.0000   Max.   :6.20   Max.   :3.000   Max.   :3.0000  
##                                                  NA's   :4       
##       thal           target      
##  Min.   :3.000   Min.   :0.0000  
##  1st Qu.:3.000   1st Qu.:0.0000  
##  Median :3.000   Median :0.0000  
##  Mean   :4.734   Mean   :0.9373  
##  3rd Qu.:7.000   3rd Qu.:2.0000  
##  Max.   :7.000   Max.   :4.0000  
##  NA's   :2

Data preprocessing

We will preprocess the data by handling missing values, encoding categorical variables, and scaling numerical features.

Convert binary variables to 0’s and 1’s.

# Recode each indicator column to a strict 0/1 coding: any value above zero
# becomes 1 and everything else becomes 0 (NA stays NA). For `target` this
# merges every raw value above 0 into a single "disease present" class.
binary_cols <- c("target", "sex", "fbs", "exang")
Heart.df[binary_cols] <- lapply(
  Heart.df[binary_cols],
  function(values) as.numeric(values > 0)
)

Handle missing values in the ca and thal variables by imputation (median for ca, mode for thal).

# Impute the missing `ca` values with the column median.
Heart.df$ca[is.na(Heart.df$ca)] <- median(Heart.df$ca, na.rm = TRUE)

# Impute the missing `thal` values with the mode (most frequent observed
# code). `thal` holds categorical codes, so the most common category is used
# rather than a median. "?" entries were already converted to NA when the
# file was read (na.strings = "?"), so is.na() is the only check required.
thal_mode <- as.numeric(names(which.max(table(Heart.df$thal))))
Heart.df$thal[is.na(Heart.df$thal)] <- thal_mode

Check whether any missing values remain

colSums(is.na(Heart.df))
##      age      sex       cp trestbps     chol      fbs  restecg  thalach 
##        0        0        0        0        0        0        0        0 
##    exang  oldpeak    slope       ca     thal   target 
##        0        0        0        0        0        0

Check for duplicate entries and print them if they exist.

# Flag every row that has an identical twin. Scanning from both ends marks
# all members of a duplicate group, not only the repeats after the first.
dup_from_top <- duplicated(Heart.df)
dup_from_bottom <- duplicated(Heart.df, fromLast = TRUE)
dupes <- Heart.df[dup_from_top | dup_from_bottom, ]

# Show the duplicated rows (an empty data frame means none were found)
print(dupes)
##  [1] age      sex      cp       trestbps chol     fbs      restecg  thalach 
##  [9] exang    oldpeak  slope    ca       thal     target  
## <0 rows> (or 0-length row.names)

Convert categorical variables to factors.

# Numeric code -> label mapping for every categorical column. Each entry
# lists the codes present in the data and the label to show for each one.
factor_specs <- list(
  sex = list(
    levels = c(0, 1),
    labels = c("Female", "Male")
  ),
  cp = list(
    levels = c(1, 2, 3, 4),
    labels = c("Typical Angina", "Atypical Angina",
               "Non-Angina", "Asymptomatic")
  ),
  fbs = list(
    levels = c(0, 1),
    labels = c("False", "True")
  ),
  restecg = list(
    levels = c(0, 1, 2),
    labels = c("Normal", "Wave-abnormality", "Probable")
  ),
  exang = list(
    levels = c(0, 1),
    labels = c("No", "Yes")
  ),
  slope = list(
    levels = c(1, 2, 3),
    labels = c("Upsloping", "Flat", "Downsloping")
  ),
  thal = list(
    levels = c(3, 6, 7),
    labels = c("Normal", "Fixed Defect", "Reversible")
  ),
  target = list(
    levels = c(0, 1),
    labels = c("No", "Yes")
  )
)

# Convert each listed column in place to a labelled factor
for (col_name in names(factor_specs)) {
  spec <- factor_specs[[col_name]]
  Heart.df[[col_name]] <- factor(
    Heart.df[[col_name]],
    levels = spec[["levels"]],
    labels = spec[["labels"]]
  )
}

Exploratory data analysis

Plot distribution of target variable.

# Bar chart of the number of records in each target class
ggplot(data = Heart.df, mapping = aes(x = target, fill = target)) +
  geom_bar() +
  labs(
    title = "Distribution of Heart Disease",
    x = "Heart Disease",
    fill = "Heart Disease"
  ) +
  theme_test()

Visualize age distribution by heart disease presence.

# Histogram of age (15 bins), with bars coloured by target class.
# `target` was already converted to a factor above, so it is mapped to
# `fill` directly, matching the other plots in this section.
ggplot(Heart.df, aes(x = age, fill = target)) +
  geom_histogram(bins = 15) + theme_test() +
  labs(title = "Age Distribution by Heart Disease",
       x = "Age", fill = "Heart Disease")

Visualize chol distribution by heart disease.

# Histogram of serum cholesterol (15 bins), with bars coloured by target class
ggplot(data = Heart.df, mapping = aes(x = chol, fill = target)) +
  geom_histogram(bins = 15) +
  labs(
    title = "Cholesterol Distribution by Heart Disease",
    x = "Cholesterol",
    fill = "Heart Disease"
  ) +
  theme_test()

Visualize trestbps distribution by heart disease.

# Histogram of resting blood pressure (15 bins), with bars coloured by target
# class. `target` was already converted to a factor above, so it is mapped
# to `fill` directly, matching the other plots in this section.
ggplot(Heart.df, aes(x = trestbps, fill = target)) +
  geom_histogram(bins = 15) + theme_test() +
  labs(title = "Resting Blood Pressure Distribution by Heart Disease",
       x = "Resting Blood Pressure", fill = "Heart Disease")


Modeling


Evaluation


Deployment


Conclusion